# Wrangling
library(tidyverse)
library(mgsub)
# Statistics/Numerical processing
library(brms)
# PCA
library(FactoMineR)
library(factoextra)
library(GPArotation)
library(paran)
# Plotting
library(ggplot2)
library(ghibli)
# Optional settings
# Use TRUE/FALSE, not T/F: T and F are ordinary variables and can be reassigned.
options(dplyr.summarise.inform=FALSE) # Stop dplyr from printing summarise message (that isn't an error)
# Pin select() to dplyr's version: MASS (imported/required by paran) also exports select()
select <- dplyr::select
# Standard error function
# Standard error of the mean.
#
# @param x     Numeric vector (may contain NA).
# @param na.rm Passed to var(); defaults to TRUE so NAs are excluded from the
#              variance. The denominator always counts non-missing values.
# @return Scalar: sqrt(var(x) / n_non_missing).
std.error <- function(x, na.rm = TRUE) {
  # sum(!is.na(x)) equals length(x[complete.cases(x)]) for an atomic vector,
  # without materialising an intermediate subset
  sqrt(var(x, na.rm = na.rm) / sum(!is.na(x)))
}
# ggplot theme
# Shared ggplot theme: theme_bw() with enlarged text, white facet strips,
# and a bold legend title. Successive theme() layers merge, so a single
# theme() call is equivalent to the layered form.
gg_theme <- function() {
  theme_bw() +
    theme(
      plot.title = element_text(size = 25),
      plot.subtitle = element_text(size = 15, face = "italic"),
      axis.title = element_text(size = 20),
      axis.text = element_text(size = 15),
      strip.background = element_rect(fill = "white"),
      strip.text = element_text(size = 15),
      legend.title = element_text(size = 15, face = "bold"),
      legend.text = element_text(size = 15)
    )
}
Read in the data from downloaded CSV files from Firebase. Also remove data from subjects who did not complete the study (“returned” on Prolific) or have been identified to be outliers, not following instructions, not fulfilling my participant requirements, etc. Outliers are identified in a later section below (Outlier Check), while participant requirements are checked in the data from the questionnaire.
# List results files per subject.
# The pattern argument is a regular expression, not a glob: a bare "." matches
# any character, so ".csv" would also match names like "resultsXcsv".
# "\\.csv$" matches only a literal ".csv" suffix.
filelist <- list.files(path="./data/axb_1a/perception/", pattern="\\.csv$", full.names=TRUE)
# Check Number of files
paste("Total number of perception results files in directory:",length(filelist))
[1] "Total number of perception results files in directory: 134"
# Expected usable files = all files minus known repeat and returned submissions
paste("Expected number of questionnaire files:",length(filelist)-2-2) # -2 (repeats) -2 (returns)
[1] "Expected number of questionnaire files: 130"
In this case, when reading in the data by participant file, the experiment start time is extracted from the structure ./data/axb_XX/perception/subjnumber_starttime.csv where I first remove the “.csv” with gsub, split the string into three based on "_", then select the third item from the first output object ([[1]][3]).
# Read and Concatenate results
## (1) Just read and concat
# condata.read <- do.call(rbind, lapply(filelist, read.csv))
## (2) Read, concat and extract part of filename
## File names look like ./data/axb_XX/perception/subjnumber_starttime.csv.
## Strip the ".csv" suffix with an anchored regex (a bare "." would match any
## character) and take the third "_"-separated field as the start time
## (the path itself contains one "_" in "axb_XX", hence index 3).
condata.read <- do.call(rbind, lapply(filelist, function(x) cbind(read.csv(x),
                                      starttime=strsplit(gsub("\\.csv$","",x), "_")[[1]][3])))
## TODO: Update with subject numbers as necessary
# Subjects excluded from analysis, with reasons.
# Outliers are identified in the Outlier Check section below; participant
# requirements are checked against the questionnaire data.
dropped.subjects <- c(
  "5f4fc3f3e037580c36d1808e", # only half data --- condA
  "5f2d6b6fb5f3f201942d3eb6", # grew up in IL --- condB
  "5f5353ff017c5965165df065", # grew up in TX --- condB
  "5c904569bc1a950016fbfc91", # grew up in NE --- condB
  "5e57501c1893b21054dbf993", # too many unusable data --- condD
  "5c5067d2c879e900017cde4c", # too many unusable data --- condD
  "5eb2f7d3c2c3c90c11cbb1cf", # too many unusable data --- condD
  "5e267b418e5af58d037f70f0", # majority unusable data --- condE
  "5d2c29f9917e53001a3a5c9d", # majority unusable data --- condE
  "5c961ee66357fb0001ef4b4a", # too many unusable data --- condE #axbdata.ind.s3
  "5c659f77d0fb37000172f5dd", # too many unusable data; no LBQ data --- condE
  "5c8f224c9d2bff0001867ce2", # returned submission (no LBQ data) --- condE
  "5f5cc6062074a44b9880e431", # majority unusable data --- condF
  "5aa02ab689de8200013f3dab"  # timed out submission --- condF
)
condata.read <- condata.read %>%
  # Fix data with undefined subject
  mutate(participantId = replace_na(participantId, '5d1e2045a37a4d001a1fc2cb')) %>%
  # Remove data from dropped subjects (single %in% test instead of 14 chained subsets)
  subset(!(participantId %in% dropped.subjects)) %>%
  # Remove levels of dropped subjects
  droplevels()
# Drop bookkeeping columns that are not needed for analysis
condata <- subset(condata.read, ,
                  -c(url, internal_node_id, view_history, stimulus, success,
                     key_press, trial_index, trial_type, wordStim))
# Keep only the tonetest and test rows, then:
# - recover vowel identity from sentNum (vowel was accidentally left out of
#   the stimulus data): 11-20 = AU, 21-30 = AI
# - derive guiseCombination and speakerOrder from the condition code
# - fix data types and put the key columns first
condata <- condata %>%
  subset(trial_role %in% c("test", "tonetest")) %>%
  #arrange(desc(starttime)) %>%
  #mutate(subjNum = as.factor(rep(1:nSubj, each=(nrow(.)/nSubj)))) %>%
  mutate(vowel = case_when(between(sentNum, 11, 20) ~ "AU",
                           between(sentNum, 21, 30) ~ "AI")) %>%
  mutate(guiseCombination = case_when(conditionId %in% c("condA", "condB") ~ "match",
                                      conditionId %in% c("condC", "condD") ~ "mismatch",
                                      conditionId %in% c("condE", "condF") ~ "baseline")) %>%
  mutate(speakerOrder = case_when(conditionId %in% c("condA", "condC", "condE") ~ "S3-S9",
                                  conditionId %in% c("condB", "condD", "condF") ~ "S9-S3")) %>%
  mutate(rt = as.numeric(rt)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(time_elapsed_sec = time_elapsed/1000, rt_sec = rt/1000) %>%
  select(participantId, conditionId, trial_role, time_elapsed, time_elapsed_sec,
         rt, rt_sec, everything())
# Check data
condata
#summary(condata)
# Check number of data points per subject
# Correct number of data points is 174/180/186 = (168 + 6/12/18)
# Grouping by starttime as well means duplicate files for one ID show up as separate rows
(nData.bysubj <- condata %>%
group_by(conditionId, participantId, starttime) %>%
count())
# Check number of data points per condition (goal: 40 per condition)
# count() here tallies rows of nData.bysubj, i.e. number of subject files per condition
(nSubj.bycond <- nData.bysubj %>%
group_by(conditionId) %>%
count())
Troubleshoot participant issues manually and/or by filtering duplicate subject files or those with too few responses.
# Manually scroll to check data as needed
# NOTE(review): View() is for interactive inspection only
View(nData.bysubj)
# Troubleshoot duplicated participants
## Calculate number of subjects vs data files
paste("Total number of screened perception data files:",nrow(nData.bysubj)) # number of data files
[1] "Total number of screened perception data files: 120"
paste("Total number of unique participant numbers:",length(unique(condata.read$participantId))) # unique subj nums
[1] "Total number of unique participant numbers: 120"
## Identify dups
## duplicated() flags second and later occurrences of the same participantId
nData.bysubj$dups = duplicated(nData.bysubj$participantId)
nData.bysubj %>% filter(dups==TRUE)
# Troubleshoot half data
# NOTE(review): threshold 342 does not match the 174/180/186 counts stated above -- confirm intended cutoff
nData.bysubj %>% filter(n<342)
NA
# Score the headphone/attention check ("tonetest" trials).
# For each subject, keep the last 6 tone trials (largest time_elapsed) and
# count how many were answered correctly. The two original pipes are merged
# into one: after slice_max() the frame is still grouped by
# (participantId, conditionId), so summarise() aggregates per subject exactly
# as before.
condata.tones <- condata %>%
  subset(trial_role == "tonetest") %>%
  mutate(correct_response = tolower(correct_response)) %>%
  group_by(participantId, conditionId) %>%
  slice_max(order_by = time_elapsed, n = 6) %>%
  summarise(tonesCorrect = sum(correct_response == "true")) %>%
  ungroup()
condata.tones
# Flag participants who scored 4/6 or below on the check
condata.tones %>% filter(tonesCorrect < 5)
# Load stim.durations.final dataframe
load(file="./data/stim_durations.rData")
# Select "Test" data + remove columns for Tonetest data
# Merge tonesCorrect column in for later decisions (e.g. if remove people with low score in tone test)
# NOTE: merge() with no `by` argument joins on all shared column names; both
# merges are left joins (all.x=TRUE) so test rows are kept even without a match
condata.test <- condata %>%
subset(trial_role == "test") %>%
subset(.,,-c(button_pressed, correct_answer, correct_response)) %>%
droplevels() %>%
merge(., condata.tones, all.x=TRUE) %>%
merge(., stim.durations.final, all.x=TRUE) %>%
select(participantId, conditionId, trial_role, time_elapsed, time_elapsed_sec, rt, rt_sec, raised_response, everything())
condata.test
# Check data for obvious issues
summary(condata.test)
participantId conditionId trial_role time_elapsed time_elapsed_sec rt rt_sec raised_response vowel speaker
5610c4ea7ffc8a0005811504: 336 condA:6720 test:40320 Min. : 22385 Min. : 22.39 Min. : 0 Min. : 0.000 false:19849 AI:20160 S3:20160
57509d9b363e77000695620b: 336 condB:6720 1st Qu.: 761982 1st Qu.: 761.98 1st Qu.: 3262 1st Qu.: 3.262 true :20471 AU:20160 S9:20160
5765b92cf96b100001f64c9a: 336 condC:6720 Median :1263204 Median :1263.20 Median : 3592 Median : 3.592
57901c44900cc80001d2e9c7: 336 condD:6720 Mean :1358107 Mean :1358.11 Mean : 3992 Mean : 3.991
59302683d0003800018f39b0: 336 condE:6720 3rd Qu.:1785733 3rd Qu.:1785.73 3rd Qu.: 4049 3rd Qu.: 4.049
596016f51d5c3e00012f03c3: 336 condF:6720 Max. :6413055 Max. :6413.06 Max. :2722416 Max. :2722.416
(Other) :38304
sentNum order step speakerIdentity speakerGuise guiseName raised_answer key_response starttime guiseCombination speakerOrder
Min. :11.00 axb:20160 Min. :2 CN:20160 BL:13440 S3-BL:6720 Min. :0.0 Min. :0.0000 1599320556796: 336 baseline:13440 S3-S9:20160
1st Qu.:18.75 bxa:20160 1st Qu.:3 MI:20160 CN:13440 S3-CN:6720 1st Qu.:0.0 1st Qu.:0.0000 1599320712659: 336 match :13440 S9-S3:20160
Median :20.50 Median :5 MI:13440 S3-MI:6720 Median :0.5 Median :1.0000 1599321536428: 336 mismatch:13440
Mean :21.33 Mean :5 S9-BL:6720 Mean :0.5 Mean :0.6014 1599321623488: 336
3rd Qu.:23.50 3rd Qu.:7 S9-CN:6720 3rd Qu.:1.0 3rd Qu.:1.0000 1599321799102: 336
Max. :30.00 Max. :8 S9-MI:6720 Max. :1.0 Max. :1.0000 1599322002660: 336
(Other) :38304
tonesCorrect dur_sec dur_ms quart_dur_sec quart_dur_ms
Min. :0.000 Min. :1.799 Min. :1799 Min. :1.349 Min. :1349
1st Qu.:2.750 1st Qu.:2.260 1st Qu.:2260 1st Qu.:1.695 1st Qu.:1695
Median :6.000 Median :2.506 Median :2506 Median :1.880 Median :1880
Mean :4.392 Mean :2.464 Mean :2464 Mean :1.848 Mean :1848
3rd Qu.:6.000 3rd Qu.:2.678 3rd Qu.:2678 3rd Qu.:2.009 3rd Qu.:2009
Max. :6.000 Max. :3.022 Max. :3022 Max. :2.266 Max. :2266
# Check original data points
# Baseline row count before RT-outlier removal; compared against the final count below
(datapoints.og <- nrow(condata.test))
[1] 40320
# Per-subject timing descriptives: total experiment time (minutes) plus
# mean/min/max/sd of reaction time in seconds
condata.test.bysubj <- condata.test %>%
  group_by(participantId, conditionId) %>%
  summarize(
    time_elapsed_min = max(time_elapsed_sec/60),
    mean_rt_sec = mean(rt_sec),
    min_rt_sec = min(rt_sec),
    max_rt_sec = max(rt_sec),
    sd_rt_sec = sd(rt_sec)
  ) %>%
  ungroup()
head(condata.test.bysubj)
# Overall distribution of total experiment times
# (used to spot especially short or long sessions)
condata.test.overall <- condata.test.bysubj %>%
  summarize(
    median_time = median(time_elapsed_min),
    mean_time = mean(time_elapsed_min),
    min_time = min(time_elapsed_min),
    max_time = max(time_elapsed_min)
  )
condata.test.overall
# RT Outlier check
# Calculate response times that are at least 3 SDs away from the mean
# NOTE(review): these global summaries (sd3/iqr3/stimTime10) are computed for
# reference only; the criteria actually applied below use per-stimulus durations
condata.test.timesum <- condata.test %>%
summarize(meanTime = mean(rt), sdTime = sd(rt), minTime = min(rt), maxTime = max(rt), medianTime = median(rt), iqrTime = IQR(rt), meanStimTime = mean(dur_ms)) %>%
mutate(sd3 = sdTime*3, iqr3 = iqrTime*3, stimTime10 = meanStimTime+10000)
condata.test.timesum
# Add columns of outlier criteria
# Too fast: responded before 3/4 of the audio had played (quart_dur_ms);
# too slow: responded more than 10 s after the audio ended (dur_ms + 10000)
condata.test.check <- condata.test %>%
mutate(rt.outlier.lower = rt < quart_dur_ms, rt.outlier.upper = rt > (dur_ms + 10000))
# Check list of outliers that were removed
condata.test.outliers <- condata.test.check %>%
subset(rt.outlier.lower == TRUE | rt.outlier.upper == TRUE)
summary(condata.test.outliers)
participantId conditionId trial_role time_elapsed time_elapsed_sec rt rt_sec raised_response vowel speaker
5c7d960e461e90000179f6d6: 31 condA:39 test:350 Min. : 40523 Min. : 40.52 Min. : 0.0 Min. : 0.0000 false:186 AI:164 S3:182
5ebac5b2ae6d130365b0ed08: 22 condB:72 1st Qu.: 789082 1st Qu.: 789.08 1st Qu.: 909.5 1st Qu.: 0.9095 true :164 AU:186 S9:168
5f05ece827813a0da11cd397: 22 condC:45 Median :1461233 Median :1461.23 Median : 13130.5 Median : 13.1305
5e72318254cb2015a07ef417: 18 condD:67 Mean :1514033 Mean :1514.03 Mean : 25352.7 Mean : 25.3527
5e521b2996dbff237ca41f84: 17 condE:92 3rd Qu.:2104518 3rd Qu.:2104.52 3rd Qu.: 18501.8 3rd Qu.: 18.5018
5e746f7a64051f3d90ff620f: 16 condF:35 Max. :4216455 Max. :4216.45 Max. :2722416.0 Max. :2722.4160
(Other) :224
sentNum order step speakerIdentity speakerGuise guiseName raised_answer key_response starttime guiseCombination speakerOrder
Min. :11.00 axb:176 Min. :2.000 CN:168 BL:127 S3-BL:45 Min. :0.0000 Min. :0.0000 1600713995143: 31 baseline:127 S3-S9:176
1st Qu.:18.00 bxa:174 1st Qu.:3.000 MI:182 CN:108 S3-CN:67 1st Qu.:0.0000 1st Qu.:0.0000 1599354613292: 22 match :111 S9-S3:174
Median :20.00 Median :5.000 MI:115 S3-MI:70 Median :0.0000 Median :1.0000 1599579676992: 22 mismatch:112
Mean :21.09 Mean :4.994 S9-BL:82 Mean :0.4971 Mean :0.5771 1600800079717: 18
3rd Qu.:23.00 3rd Qu.:7.000 S9-CN:41 3rd Qu.:1.0000 3rd Qu.:1.0000 1599863886680: 17
Max. :30.00 Max. :8.000 S9-MI:45 Max. :1.0000 Max. :1.0000 1599402567115: 16
(Other) :224
tonesCorrect dur_sec dur_ms quart_dur_sec quart_dur_ms rt.outlier.lower rt.outlier.upper
Min. :0.000 Min. :1.799 Min. :1799 Min. :1.349 Min. :1349 Mode :logical Mode :logical
1st Qu.:2.000 1st Qu.:2.272 1st Qu.:2272 1st Qu.:1.704 1st Qu.:1704 FALSE:194 FALSE:156
Median :3.000 Median :2.540 Median :2540 Median :1.905 Median :1905 TRUE :156 TRUE :194
Mean :3.491 Mean :2.488 Mean :2488 Mean :1.866 Mean :1866
3rd Qu.:6.000 3rd Qu.:2.682 3rd Qu.:2682 3rd Qu.:2.011 3rd Qu.:2011
Max. :6.000 Max. :3.022 Max. :3022 Max. :2.266 Max. :2266
# Summarize number of outliers attributed to each participant
# (sorted descending, so the worst offenders print first)
condata.outliers.bysubj <- condata.test.outliers %>% #filter(conditionId=="condC") %>%
group_by(participantId, conditionId) %>%
count(sort=TRUE)
condata.outliers.bysubj
# Manual inspection of the individual outlier rows
View(condata.test.outliers)
# Manual bookkeeping: per-subject outlier proportions (n outliers / 336 trials).
# The bare divisions are evaluated so the knitted output prints each proportion;
# commented-out lines are subjects already removed in the exclusion list above.
# CondA
#19/336 #0.05654762 subj 5f4fc3f3e037580c36d1808e --- removed (for other reasons)
12/336 #0.03571429 subj 5f498900c3368e3df50ea6ac
4/336 #0.01190476 subj 5c698bdf707a740001032f4e, 5cf308da81ea050017a6151e, 5f5065f8c62ac41e03484ba1
# CondB
22/336 #0.06547619 subj 5f05ece827813a0da11cd397 --- Over 5% of data missing...
16/336 #0.04761905 subj 5e8cd8011a88ad08f181eb96
16/336 #0.04761905 subj 5e746f7a64051f3d90ff620f
7/336 #0.02083333 subj 5f4a4da1575d605c43bef871
#5/336 #0.01488095 subj 5f2d6b6fb5f3f201942d3eb6 --- removed
# CondC
13/336 #0.03869048 subj 5f2d5ea5a073960008ed5894
7/336 #0.02083333 subj 5ec44da606cb7931f04a35e9
6/336 #0.01785714 subj 5f12fa2b44b6a42a83519a79
5/336 #0.01488095 subj 5e8ab4b84d3d6775b807e9ba
4/336 #0.01190476 subj 5765b92cf96b100001f64c9a
# CondD
#119/336 #0.3541667 subj 5c5067d2c879e900017cde4c --- removed
#117/336 #0.3482143 subj 5eb2f7d3c2c3c90c11cbb1cf --- removed
#52/336 #0.1547619 subj 5e57501c1893b21054dbf993 --- removed
22/336 #0.06547619 subj 5ebac5b2ae6d130365b0ed08 --- Over 5% of data missing...
8/336 #0.02380952 subj 5d485db699f83a00012bc391
7/336 #0.02083333 subj 5c5067d2c879e900017cde4c, 5cdf2adac194e800187ad97a
4/336 #0.01190476 subj 5bbbca5d63ea8a00019bd7fa, 5e6cbcbc4b1e3727b86b6327, 5f19fa99263a8c0c22573e08
# CondE
#304/336 #0.9047619 subj 5e267b418e5af58d037f70f0 --- removed
#230/336 #0.6845238 subj 5c659f77d0fb37000172f5dd --- removed
#215/336 #0.639881 subj 5d2c29f9917e53001a3a5c9d --- removed
#157/336 #0.4672619 subj 5c961ee66357fb0001ef4b4a --- removed
31/336 #0.0922619 subj 5c7d960e461e90000179f6d6 ...
18/336 #0.05357143 subj 5e72318254cb2015a07ef417
17/336 #0.05059524 subj 5e521b2996dbff237ca41f84
11/336 #0.0327381 subj 5f285f04a241c5166e68dc95
5/336 #0.01488095 subj 5ef4dde17a4f7c0f80a0c593
4/336 #0.01190476 subj 5c8f224c9d2bff0001867ce2
# CondF
#307/336 #0.9136905 subj 5f5cc6062074a44b9880e431 --- removed
#54/336 #0.1607143 subj 5aa02ab689de8200013f3dab --- removed (for timing out; took 137.58728 min to complete AXB)
16/336 #0.04761905 subj 5e954bac8099ec8c18ba2b69
7/336 #0.02083333 subj 5bdf513338109a0001f474c8
Go back to top and remove outlier participants, if necessary. Then rerun everything up to this point.
# Remove outliers
# setdiff() here is dplyr's data-frame set operation: keeps rows of
# condata.test.check that do not appear in condata.test.outliers
condata.test.final <- setdiff(condata.test.check, condata.test.outliers)
# Group data by StimType (i.e. Speaker-SpeakerGuise-Vowel-SentNum)
# token = speaker_sentNum; word = lexical item for that token (lookup below);
# respRS = 1 for a "raised" response, 0 otherwise;
# step is recentred by subtracting 5 so the continuum midpoint is 0
condata.test.final <- condata.test.final %>%
unite(token, speaker, sentNum, remove=FALSE) %>%
mutate(word = case_when(token == "S3_21" ~ "bright",
token == "S3_22" ~ "device",
token == "S3_30" ~ "twice",
token == "S9_23" ~ "goodnight",
token == "S9_25" ~ "invite",
token == "S9_29" ~ "sight",
token == "S3_18" ~ "slouch",
token == "S3_19" ~ "without",
token == "S3_20" ~ "workout",
token == "S9_11" ~ "checkout",
token == "S9_18" ~ "sprouts",
token == "S9_20" ~ "workout")) %>%
mutate(item = paste0(speaker,"_",word)) %>%
mutate(respRS = case_when(raised_response == "true" ~ 1,
raised_response == "false" ~ 0)) %>%
unite(stimType_byword, speaker, speakerGuise, vowel, word, remove=FALSE) %>%
unite(stimType_byvowel, speaker, speakerGuise, vowel, remove=FALSE) %>%
mutate(sentNum = as.factor(sentNum)) %>%
mutate(step = (step-5)) %>%
select(participantId, guiseCombination, step, vowel, speakerGuise, speaker, word, speakerOrder, respRS, stimType_byword, stimType_byvowel, everything())
participantId guiseCombination step vowel speakerGuise speaker word speakerOrder respRS stimType_byword
5610c4ea7ffc8a0005811504: 336 baseline:13313 Min. :-3e+00 AI:19996 BL:13313 S3:19978 Length:39970 S3-S9:19984 Min. :0.0000 Length:39970
57509d9b363e77000695620b: 336 match :13329 1st Qu.:-2e+00 AU:19974 CN:13332 S9:19992 Class :character S9-S3:19986 1st Qu.:0.0000 Class :character
57901c44900cc80001d2e9c7: 336 mismatch:13328 Median : 0e+00 MI:13325 Mode :character Median :1.0000 Mode :character
596016f51d5c3e00012f03c3: 336 Mean : 5e-05 Mean :0.5081
59a88f5b321f870001d16d68: 336 3rd Qu.: 2e+00 3rd Qu.:1.0000
5a8e2e8faa46dd00016bfb5a: 336 Max. : 3e+00 Max. :1.0000
(Other) :37954
stimType_byvowel conditionId trial_role time_elapsed time_elapsed_sec rt rt_sec raised_response token sentNum
Length:39970 condA:6681 test:39970 Min. : 22385 Min. : 22.39 Min. : 1643 Min. : 1.643 false:19663 Length:39970 20 : 6661
Class :character condB:6648 1st Qu.: 761758 1st Qu.: 761.76 1st Qu.: 3269 1st Qu.: 3.269 true :20307 Class :character 18 : 6657
Mode :character condC:6675 Median :1262205 Median :1262.20 Median : 3592 Median : 3.592 Mode :character 21 : 3336
condD:6653 Mean :1356742 Mean :1356.74 Mean : 3804 Mean : 3.804 23 : 3336
condE:6628 3rd Qu.:1784197 3rd Qu.:1784.20 3rd Qu.: 4040 3rd Qu.: 4.040 25 : 3335
condF:6685 Max. :6413055 Max. :6413.06 Max. :12850 Max. :12.850 29 : 3335
(Other):13310
order speakerIdentity guiseName raised_answer key_response starttime tonesCorrect dur_sec dur_ms quart_dur_sec quart_dur_ms
axb:19984 CN:19992 S3-BL:6675 Min. :0.0 Min. :0.0000 1599321623488: 336 Min. :0.0 Min. :1.799 Min. :1799 Min. :1.349 Min. :1349
bxa:19986 MI:19978 S3-CN:6653 1st Qu.:0.0 1st Qu.:0.0000 1599321799102: 336 1st Qu.:3.0 1st Qu.:2.224 1st Qu.:2224 1st Qu.:1.668 1st Qu.:1668
S3-MI:6650 Median :1.0 Median :1.0000 1599322002660: 336 Median :6.0 Median :2.473 Median :2473 Median :1.854 Median :1854
S9-BL:6638 Mean :0.5 Mean :0.6016 1599322482285: 336 Mean :4.4 Mean :2.464 Mean :2464 Mean :1.848 Mean :1848
S9-CN:6679 3rd Qu.:1.0 3rd Qu.:1.0000 1599325250629: 336 3rd Qu.:6.0 3rd Qu.:2.677 3rd Qu.:2677 3rd Qu.:2.008 3rd Qu.:2008
S9-MI:6675 Max. :1.0 Max. :1.0000 1599327399382: 336 Max. :6.0 Max. :3.022 Max. :3022 Max. :2.266 Max. :2266
(Other) :37954
rt.outlier.lower rt.outlier.upper item
Mode :logical Mode :logical Length:39970
FALSE:39970 FALSE:39970 Class :character
Mode :character
# Print data
condata.test.final
# Write to file
write.csv(condata.test.final, 'data/axb_1a_exp_data.csv', row.names=F)
# Final kept data points
(datapoints.final <- nrow(condata.test.final))
[1] 39970
# Final removed data points
datapoints.og-datapoints.final
[1] 350
# Calculate percentage of data removed
(datapoints.og-datapoints.final)/datapoints.og # = 0.008680556 = ~0.9% of the data were removed due to responses that were too quick (less than 3/4 of the time into the audio file, before the third token would have played) or too slow (over 10 sec after the end of the audio file, an arbitrarily chosen value that should be enough time if a participant were responding as quickly as possible)
[1] 0.008680556
# Get subj means per condition
# Two-stage averaging: subject means first, then condition grand means,
# so each subject contributes equally regardless of trial count
subj.means <- condata.test.final %>% #filter(participantId!='5e42f74f5b772a18434cabf7') %>%
group_by(participantId, step, vowel, speaker, speakerGuise) %>%
summarise(mean.Prop = mean(respRS))
# Get group means and se per condition (by averaging speaker means)
condition.means <- subj.means %>%
group_by(step, vowel, speaker, speakerGuise) %>%
summarise(grandM.Prop = mean(mean.Prop), se = std.error(mean.Prop))
# Plot lineplot with error bars on step points
# Faceted speaker (rows) x vowel (columns); colour/linetype encode guise
byGuise_prop_plot <- condition.means %>%
ggplot(aes(x = step, y = grandM.Prop)) +
geom_point(stat="identity", aes(colour = factor(speakerGuise)), cex=5) +
geom_line(aes(colour=factor(speakerGuise), linetype=factor(speakerGuise)), lwd=1) +
geom_errorbar(width = .25, aes(ymin = grandM.Prop-se, ymax = grandM.Prop+se,
colour = factor(speakerGuise))) +
facet_grid(speaker~vowel) +
labs(y = "Proportion 'raised' response", x = "Continuum Step (UR to RS)", color="Guise",
linetype = "Guise", title="Raising Perception: By Guise") +
coord_cartesian(ylim=c(0, 1)) +
scale_x_continuous(breaks = -3:3) +
scale_color_manual(values=ghibli_palette("PonyoMedium")[c(6,4,3)]) +
gg_theme()
byGuise_prop_plot
# Get subj means per condition
# Same two-stage averaging as the proportion plot, but on log reaction time
subj.means <- condata.test.final %>%
group_by(participantId, step, vowel, speaker, speakerGuise) %>%
summarise(mean.rt = mean(log(rt)))
# Get group means and se per condition (by averaging speaker means)
condition.means <- subj.means %>%
group_by(step, vowel, speaker, speakerGuise) %>%
summarise(grandM.rt = mean(mean.rt), se = std.error(mean.rt))
# Plot lineplot with error bars on step points
byGuise_rt_plot <- condition.means %>%
ggplot(aes(x = step, y = grandM.rt)) +
geom_point(stat="identity", aes(colour = factor(speakerGuise)), cex=5) +
geom_line(aes(colour=factor(speakerGuise), linetype=factor(speakerGuise)), lwd=1) +
geom_errorbar(width = .25, aes(ymin = grandM.rt-se, ymax = grandM.rt+se, colour = factor(speakerGuise))) +
facet_grid(speaker~vowel) +
labs(y = "Log Response Time", x = "Continuum Step (UR to RS)", color="Guise",
linetype = "Guise", title="Reaction Time: By Guise") +
scale_color_manual(values=ghibli_palette("PonyoMedium")[c(6,4,3)])+
gg_theme()
byGuise_rt_plot
# Get subj means per condition
# By-word breakdown for speaker S3 only; same two-stage averaging as above
subj.means <- condata.test.final %>% filter(speaker=="S3") %>%
group_by(participantId, step, vowel, speakerGuise, speaker, word) %>%
summarise(mean.Prop = mean(respRS))
# Get group means and se per condition (by averaging speaker means)
condition.means <- subj.means %>%
group_by(step, vowel, speakerGuise, speaker, word) %>%
summarise(grandM.Prop = mean(mean.Prop), se = std.error(mean.Prop))
# AI
byWord_prop_plot <- condition.means %>% filter(vowel=="AI") %>%
ggplot(aes(x = step, y = grandM.Prop)) +
geom_point(stat="identity", aes(colour = factor(speakerGuise)), cex=5, alpha=0.75) +
geom_line(aes(colour=factor(speakerGuise), linetype=factor(word)), lwd=1) +
geom_errorbar(width = .25, aes(ymin = grandM.Prop-se, ymax = grandM.Prop+se, colour = factor(speakerGuise))) +
facet_grid(speaker~word) +
labs(y = "Proportion 'raised' response", x = "Continuum Step (UR to RS)", color="Guise",
linetype = "Word", title="AI Raising Perception: By Word") +
coord_cartesian(ylim=c(0, 1)) +
scale_x_continuous(breaks = -3:3) +
scale_color_manual(values=ghibli_palette("PonyoMedium")[c(6,4,3)])+
gg_theme()
byWord_prop_plot
# AU
# Same plot for the AU words (byWord_prop_plot is intentionally reused/overwritten)
byWord_prop_plot <- condition.means %>% filter(vowel=="AU") %>%
ggplot(aes(x = step, y = grandM.Prop)) +
geom_point(stat="identity", aes(colour = factor(speakerGuise)), cex=5, alpha=0.75) +
geom_line(aes(colour=factor(speakerGuise), linetype=factor(word)), lwd=1) +
geom_errorbar(width = .25, aes(ymin = grandM.Prop-se, ymax = grandM.Prop+se, colour = factor(speakerGuise))) +
facet_grid(speaker~word) +
labs(y = "Proportion 'raised' response", x = "Continuum Step (UR to RS)", color="Guise",
linetype = "Word", title="AU Raising Perception: By Word") +
coord_cartesian(ylim=c(0, 1)) +
scale_x_continuous(breaks = -3:3) +
scale_color_manual(values=ghibli_palette("PonyoMedium")[c(6,4,3)])+
gg_theme()
byWord_prop_plot
(adapted from CantoMergers project)
The Qualtrics output includes a text response file and a numerical response file. Because I want to use text for some questions (e.g. IK2, the word selection question) but numbers for other questions (e.g. familiarity scale questions), I need to work with both.
I have downloaded the files and renamed them as ‘_text’ and ‘_num’ files. Specifically, file names have been renamed from the original Qualtrics download name to final_lbq_num.csv and final_lbq_text.csv.
# List results files per subject
There were 29 warnings (use warnings() to see them)
# The pattern argument is a regular expression: "\\.csv$" matches only a
# literal ".csv" suffix (a bare "." would match any character).
filelist <- list.files(path="./data/axb_1a/questionnaire/", pattern="\\.csv$", full.names=TRUE)
# Read and Concatenate results
# (1) Just read and concat
#condata.read <- do.call(rbind, lapply(filelist, read.csv))
# (2) Read, concat and extract part of filename
# The 4th "_"-separated field of the stripped filename is the data format
# (`num` or `text`), e.g. final_lbq_num.csv; the anchored gsub avoids
# accidentally rewriting "Xcsv" sequences elsewhere in the path
quesdata.read <- do.call(rbind, lapply(filelist, function(x) cbind(read.csv(x), dataFormat=strsplit(gsub("\\.csv$","",x), "_")[[1]][4])))
# Check that newly extracted columns are correct (`num` or `text`)
# quesdata.read$dataFormat
# colnames(quesdata.read)
Remove the unnecessary columns and rows. Also extract the question text while we’re at it (before removing the rows) so that we can refer to the questions as necessary, but they won’t be in the data to be analysed.
# Drop Qualtrics metadata columns (the first several)
quesdata <- quesdata.read %>%
  select(-c(StartDate, EndDate, Status, IPAddress, Progress, Duration..in.seconds.,
            Finished, RecordedDate, ResponseId, RecipientLastName, RecipientFirstName,
            RecipientEmail, ExternalReference, LocationLatitude, LocationLongitude,
            DistributionChannel, UserLanguage, PROLIFIC_PID))
# Keep the question-text row for later reference before it is dropped from the data
questions <- quesdata %>% slice(1)
# Drop header/test rows, fix mistyped subject IDs, and join in perception-side info
quesdata <- quesdata %>%
  # Remove the question-header and import-ID rows, plus preview/test data
  subset(subjID != "Please check that your Prolific ID is correct, then press the 'next' button to continue with the survey." & subjID !="{\"ImportId\":\"QID78_TEXT\"}") %>%
  subset(subjID != "preview1") %>%
  # Fix incorrect subject numbers
  mutate(subjID = gsub("5a5a507d76d1c60001ab2ac71c60001ab2ac7", "5a5a507d76d1c60001ab2ac7", subjID)) %>%
  mutate(subjID = gsub("5f474877452857130314c9d11", "5f474877452857130314c9d1", subjID)) %>%
  droplevels() %>%
  # Merge with perception data (for convenience, uses the minimal dataframe `condata.tones`)
  ## Adds subject and condition info + automatically drops subjects that were
  ## screened out based on perception experiment performance/returns
  rename(participantId = subjID) %>%
  merge(., condata.tones) %>%
  mutate(guiseCombination = case_when(conditionId %in% c("condA", "condB") ~ "match",
                                      conditionId %in% c("condC", "condD") ~ "mismatch",
                                      conditionId %in% c("condE", "condF") ~ "baseline")) %>%
  mutate(speakerOrder = case_when(conditionId %in% c("condA", "condC", "condE") ~ "S3-S9",
                                  conditionId %in% c("condB", "condD", "condF") ~ "S9-S3")) %>%
  droplevels()
Here’s the table of the question tags, numbers and text. This interactive table allows for sorting and searching! We can use this to check the exact wording of the questions—all of them as they were shown to the participant.
# Print questions for reference
# DT::datatable renders an interactive (sortable/searchable) table in the
# knitted HTML; fixed column widths keep long question text scrollable
DT::datatable(questions,
options=list(scrollX = TRUE,
autoWidth = TRUE,
columnDefs = list(list(width = '200px', targets = "_all"))))
# Demographic question columns (text-format rows only)
quesdata.demo <- quesdata %>% filter(dataFormat == "text") %>%
  select(conditionId, participantId, guiseCombination, speakerOrder,
         Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
         Loc1_1:Loc6_4) %>%
  # Normalise free-text answers: lowercase everything except the condition
  # columns, then strip leading/trailing whitespace from character columns
  mutate(across(-c(conditionId, speakerOrder), tolower)) %>%
  mutate(across(where(is.character), str_trim))
# Check data
quesdata.demo
# Summary
# summary(quesdata.demo)
# Quick check via table
# (subset kept for eyeballing; the unique() calls below drive the recoding rules)
ques.demo.sum <- quesdata.demo %>% select(conditionId, Age, Gender, Ethnicity, Loc1_4)
with(quesdata.demo, unique(Gender))
[1] "female" "male" "woman" "femal" "female." "non binary" "cis female" "non-binary"
with(quesdata.demo, unique(Loc1_4))
[1] "mi" "michigan" "mi/usa" "" "mi usa"
[6] "michigan/usa" "georgia" "illinoise" "pa" "mi united states"
[11] "il" "united states" "michigan, usa" "mi, usa"
with(quesdata.demo, unique(Ethnicity))
[1] "white" "caucasian, white" "black" "american"
[5] "caucasian" "write" "dutch american" "hispanic"
[9] "white." "biracial black/white" "asian (korean)" "multiracial"
[13] "african american" "cauacasian" "european" "asian"
[17] "white/caucasian" "filipino/white" "middle eastern" "mixed"
[21] "latino" "am. indian" "white causasian" "multi-racial"
# Recode free-text demographics into controlled categories.
# NOTE: mgsub patterns are applied in order within each call, and the three
# Ethnicity passes depend on each other's output -- do not reorder them.
quesdata.demo <- quesdata.demo %>%
mutate(Gender = mgsub(Gender, c("woman.*|.*femal.*", "male|cis male", "non-binary|non binary|nonbinary"), c("f", "m", "nb"))) %>%
mutate_at(vars(starts_with("Loc") & ends_with("_4")), ~ mgsub(.x, c("michig.*|mi[/, ].*"), c("mi"))) %>%
mutate(Ethnicity = mgsub(Ethnicity, c("caucasian|caucasion|cauacasian|american|write|european|dutch.*","african american|african-american", "middle east.*", "latino", "biracial.*|multi.*|mixed"), c("white", "black", "middle-eastern", "hispanic", "multiracial"))) %>%
mutate(Ethnicity = mgsub(Ethnicity, c("filipino/white","white/hispanic", "am. indian"), c("multiracial","multiracial", "native"), fixed=TRUE)) %>%
mutate(Ethnicity = mgsub(Ethnicity, c(".*white.*", "black.*", ".*asian.*"), c("white", "black", "asian"))) %>%
# Change vector classes from character class
mutate_at(vars(Age), as.numeric) %>%
# Select
select(conditionId, participantId, everything())
quesdata.demo
NA
# Ethnicity Counts
# Overall, then as a wide Ethnicity x condition table
quesdata.demo %>% group_by(Ethnicity) %>% count()
quesdata.demo %>% group_by(Ethnicity, conditionId) %>% count() %>% pivot_wider(Ethnicity, names_from = conditionId, values_from=n)
NA
# Gender Counts
quesdata.demo %>% group_by(Gender) %>% count()
quesdata.demo %>% group_by(Gender, conditionId) %>% count() %>% pivot_wider(Gender, names_from = conditionId, values_from=n)
NA
# Age
# Descriptives overall and per condition
quesdata.demo %>% summarise(n=length(conditionId), minAge = min(Age), maxAge = max(Age), meanAge = mean(Age), sdAge = sd(Age))
quesdata.demo %>% group_by(conditionId) %>%
summarise(n=length(conditionId), minAge = min(Age), maxAge = max(Age), meanAge = mean(Age), sdAge = sd(Age)) %>% ungroup()
NA
# Location columns only
quesdata.loc <- quesdata.demo %>% select(participantId, starts_with("Loc"))
quesdata.loc
# Pool the unique location strings across all six Loc*_3 columns
# (replaces six copy-pasted rbind(unique(...)) steps; per-column unique()
# then a final unique() preserves the original first-occurrence order)
loc.cols <- paste0("Loc", 1:6, "_3")
loc.code <- data.frame(Loc = unique(unlist(lapply(loc.cols,
                                                  function(cc) unique(quesdata.loc[[cc]])),
                                           use.names = FALSE)))
loc.code
# Subset data to text format
quesdata.text <- quesdata %>% filter(dataFormat == "text") %>%
select(participantId, conditionId, guiseCombination, speakerOrder, everything()) %>%
# Drop demographic columns (handled separately above) and the open-ended expPurpose
select(-c(Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
Loc1_1:Loc6_4, expPurpose)) %>%
mutate_if(is.character, as.factor)
# Working subset: drop raw Q* columns, the Qualtrics score column, and the tone score
quesdata.text.sub <- quesdata.text %>%
select(-starts_with("Q"), -SC0, -tonesCorrect) %>%
droplevels()
quesdata.text.sub
IK2 (Implicit Knowledge Q2) refers to the question where subjects select all the words they think Canadians and Michiganders would pronounce differently (“Which of the following words, if any, do you think would be pronounced differently by someone from Canada, as opposed to someone from Michigan? Please select all that apply.”). The question includes 30 words, 5 of which are target /au/ words and 5 of which are target /ai/ words.
We want to go from the raw data, the selected words, to the number of target words (/au/ and /ai/ raising words) selected. The response format is words separated by commas in one cell. So, we need to separate the strings into separate words, check whether each target word occurred, then tabulate the scores. A simple search for word strings won’t work, because some words (e.g. out) are substrings of other words (e.g. about).
My solution was to first separate the single column by commas into multiple columns, one per word. To check whether the word was a response, I implement a count of 1 in a new column if a search function finds the target string in a row. This is done for each target word. Finally, I sum the count columns for /ai/ and /au/ separately to get a score out of 5.
# IK2: Can Word Selection question
# Score how many of the target /au/ and /ai/ words each subject selected.
# The raw response is one comma-separated string; a plain substring search is
# unsafe (e.g. "out" is a substring of "about", "unite" of "united"), so the
# string is split into one word per column and every target is matched exactly.
ik2 <- select(quesdata.text.sub, participantId, IK2.CanWords)
ik2 <- mutate(ik2, Words = IK2.CanWords)
# Separate Words into columns (by comma); 30 columns is a safe upper bound
# (max observed number of selected words was 23); unused cells are NA
ik2 <- separate(ik2, "Words", paste("Word", 1:30, sep="_"), sep=",", extra="drop")
word.cols <- paste("Word", 1:30, sep="_")
# Helper: per row, 1 if `word` appears among the split-out Word_* cells
# (exact match after whitespace trimming), 0 otherwise. Trimming guards
# against "word, word" style separators; NA cells never match.
has_word <- function(df, word) {
  as.integer(apply(df[, word.cols], 1, function(x) any(trimws(x) %in% word)))
}
# /au/ targets
ik2 <- mutate(ik2, IK2.au.out = has_word(ik2, "out"),
              IK2.au.about = has_word(ik2, "about"),
              IK2.au.devout = has_word(ik2, "devout"),
              IK2.au.house = has_word(ik2, "house"),
              IK2.au.pouch = has_word(ik2, "pouch"))
# /ai/ targets
ik2 <- mutate(ik2, IK2.ai.like = has_word(ik2, "like"),
              IK2.ai.right = has_word(ik2, "right"),
              IK2.ai.might = has_word(ik2, "might"),
              IK2.ai.unite = has_word(ik2, "unite"),
              IK2.ai.ripe = has_word(ik2, "ripe"))
# Sum of targets selected for /au/ and /ai/ (each a score out of 5)
ik2 <- mutate(ik2, IK2.au = rowSums(select(ik2, IK2.au.out:IK2.au.pouch)),
              IK2.ai = rowSums(select(ik2, IK2.ai.like:IK2.ai.ripe)))
# Two views of the data:
# ...per-word indicators + the /au/ and /ai/ sums
ik2.values <- select(ik2, participantId, IK2.au, IK2.ai, IK2.au.out:IK2.ai.ripe)
# ...only the /au/ and /ai/ sums
ik2.sum <- select(ik2, participantId, IK2.au:IK2.ai)
A few cases where participants do not respond to a question because the answer is ‘no’ result in an ‘NA’ entry. These ’NA’s for specific columns are adjusted “manually” to the correct value.
# Subset data to the numeric-format responses
quesdata.num <- quesdata %>%
  filter(dataFormat == "num") %>%
  # Put the ID/design columns first; keep the rest in original order
  select(participantId, conditionId, guiseCombination, speakerOrder, everything()) %>%
  select(-c(Age, Gender, Ethnicity, SpHDisorder, SpHDisorder_2_TEXT, Degree, LingExp, FirstLang,
            Loc1_1:Loc6_4, expPurpose)) %>%
  rename(EQ.raws = SC0) %>%
  mutate_at(vars(participantId:speakerOrder), as.factor) %>%
  # Columns consisting purely of digit strings become numeric here
  mutate(across(where(~ all(grepl('^\\d+$', .x))), as.numeric))
## Further adjustments
quesdata.num.sub <- quesdata.num %>%
  # Coerce remaining character columns to numeric; purely textual columns
  # turn into all-NA and are dropped by the next step
  mutate(across(where(is.character), as.numeric)) %>%
  # Drop columns that are entirely NA
  select(where(~ !all(is.na(.x)))) %>%
  # Drop question families / specific questions not used here
  select(-starts_with("Q"), -starts_with("Lang"), -starts_with("IK2"),
         -starts_with("ME1"), -ends_with("TEXT")) %>%
  # Recode NAs that can be interpreted as 'no'/'never' to the scale minimum
  mutate(Travel.NE_Visits = coalesce(Travel.NE_Visits, 1), Travel.S_Visits = coalesce(Travel.S_Visits, 1),
         Travel.W_Visits = coalesce(Travel.W_Visits, 1), Travel.Can_Visits = coalesce(Travel.Can_Visits, 1),
         Travel.NE_Time = coalesce(Travel.NE_Time, 1), Travel.S_Time = coalesce(Travel.S_Time, 1),
         Travel.W_Time = coalesce(Travel.W_Time, 1), Travel.Can_Time = coalesce(Travel.Can_Time, 1),
         PE1.Relatives = coalesce(PE1.Relatives, 2), PE1.CloseFamFriends = coalesce(PE1.CloseFamFriends, 2),
         EK3.CanAI.Diff = coalesce(EK3.CanAI.Diff, 2), EK4.CanAU.Diff = coalesce(EK4.CanAU.Diff, 2),
         ME3.Sources.OtherMedia_1 = coalesce(ME3.Sources.OtherMedia_1, 1),
         ME3.Sources.Other_1 = coalesce(ME3.Sources.Other_1, 1),
         ME3.Sources.News_1 = coalesce(ME3.Sources.News_1, 1)) %>%
  droplevels()
Problem with `mutate()` input `Lang1_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang1_1` is `.Primitive("as.double")(Lang1_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang1_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang1_2` is `.Primitive("as.double")(Lang1_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang2_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang2_1` is `.Primitive("as.double")(Lang2_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang2_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang2_2` is `.Primitive("as.double")(Lang2_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang3_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang3_1` is `.Primitive("as.double")(Lang3_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang3_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang3_2` is `.Primitive("as.double")(Lang3_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang4_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang4_1` is `.Primitive("as.double")(Lang4_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang4_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang4_2` is `.Primitive("as.double")(Lang4_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang5_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang5_1` is `.Primitive("as.double")(Lang5_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang5_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang5_2` is `.Primitive("as.double")(Lang5_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang6_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang6_1` is `.Primitive("as.double")(Lang6_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang6_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang6_2` is `.Primitive("as.double")(Lang6_2)`.NAs introduced by coercionProblem with `mutate()` input `Lang7_1`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang7_1` is `.Primitive("as.double")(Lang7_1)`.NAs introduced by coercionProblem with `mutate()` input `Lang7_2`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `Lang7_2` is `.Primitive("as.double")(Lang7_2)`.NAs introduced by coercionProblem with `mutate()` input `LangSpeakFreq`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `LangSpeakFreq` is `.Primitive("as.double")(LangSpeakFreq)`.NAs introduced by coercionProblem with `mutate()` input `LangHearFreq`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `LangHearFreq` is `.Primitive("as.double")(LangHearFreq)`.NAs introduced by coercionProblem with `mutate()` input `SK1.WhereStandard`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `SK1.WhereStandard` is `.Primitive("as.double")(SK1.WhereStandard)`.NAs introduced by coercionProblem with `mutate()` input `IK2.CanWords`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `IK2.CanWords` is `.Primitive("as.double")(IK2.CanWords)`.NAs introduced by coercionProblem with `mutate()` input `EK1.CanSpeak.Diff`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK1.CanSpeak.Diff` is `.Primitive("as.double")(EK1.CanSpeak.Diff)`.NAs introduced by coercionProblem with `mutate()` input `EK2.CanPron.Diff`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK2.CanPron.Diff` is `.Primitive("as.double")(EK2.CanPron.Diff)`.NAs introduced by coercionProblem with `mutate()` input `EK3.CanAI.Diff_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK3.CanAI.Diff_1_TEXT` is `.Primitive("as.double")(EK3.CanAI.Diff_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `EK4.CanAU.Diff_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `EK4.CanAU.Diff_1_TEXT` is `.Primitive("as.double")(EK4.CanAU.Diff_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `PE1.Relatives_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `PE1.Relatives_1_TEXT` is `.Primitive("as.double")(PE1.Relatives_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `PE1.CloseFamFriends_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `PE1.CloseFamFriends_1_TEXT` is `.Primitive("as.double")(PE1.CloseFamFriends_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `ME1.TVShows`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME1.TVShows` is `.Primitive("as.double")(ME1.TVShows)`.NAs introduced by coercionProblem with `mutate()` input `ME3.Sources.OtherMedia_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME3.Sources.OtherMedia_1_TEXT` is `.Primitive("as.double")(ME3.Sources.OtherMedia_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `ME3.Sources.Other_1_TEXT`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `ME3.Sources.Other_1_TEXT` is `.Primitive("as.double")(ME3.Sources.Other_1_TEXT)`.NAs introduced by coercionProblem with `mutate()` input `dataFormat`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `dataFormat` is `.Primitive("as.double")(dataFormat)`.NAs introduced by coercion
quesdata.num.sub
# Select and merge only the relevant numerical columns for the PCA analysis
quesdata.clean <- quesdata.demo %>%
  select(participantId, conditionId, guiseCombination, speakerOrder, Age, Gender, Ethnicity) %>%
  merge(ik2.values) %>%
  merge(quesdata.num.sub) %>%
  droplevels()
# quesdata.final
quesdata.clean
# Check correlations, which motivate using PCA to reduce dimensionality
quesdata.clean %>% with(cor.test(IK2.ai, IK2.au))
Pearson's product-moment correlation
data: IK2.ai and IK2.au
t = 0.10684, df = 118, p-value = 0.9151
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1697059 0.1887432
sample estimates:
cor
0.009834557
# Test correlations of similar questions (Canada travel: visits vs. time)
quesdata.clean %>% with(cor.test(Travel.Can_Visits, Travel.Can_Time))
Pearson's product-moment correlation
data: Travel.Can_Visits and Travel.Can_Time
t = 4.8225, df = 118, p-value = 4.271e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2442879 0.5453413
sample estimates:
cor
0.4057623
with(quesdata.clean, cor.test(SE1.Fam.oot_1, SE2.Freq.Overall_1))
Pearson's product-moment correlation
data: SE1.Fam.oot_1 and SE2.Freq.Overall_1
t = 6.3242, df = 118, p-value = 4.725e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3559947 0.6259258
sample estimates:
cor
0.5031317
with(quesdata.clean, cor.test(SE2.Freq.Overall_1, SE2.Freq.Recent_1))
Pearson's product-moment correlation
data: SE2.Freq.Overall_1 and SE2.Freq.Recent_1
t = 15.478, df = 118, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7492182 0.8701187
sample estimates:
cor
0.8185375
with(quesdata.clean, cor.test(SE2.Freq.Overall_1, SE2.Freq.Child_1))
Pearson's product-moment correlation
data: SE2.Freq.Overall_1 and SE2.Freq.Child_1
t = 11.606, df = 118, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6337845 0.8041026
sample estimates:
cor
0.7300876
with(quesdata.clean, cor.test(SE2.Freq.Recent_1, SE2.Freq.Child_1))
Pearson's product-moment correlation
data: SE2.Freq.Recent_1 and SE2.Freq.Child_1
t = 6.2574, df = 118, p-value = 6.521e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3513380 0.6226779
sample estimates:
cor
0.499146
Packages required for this PCA: * PCA command from FactoMineR library (see index for more info) * paran command from paran library * Varimax command from GPArotation library (https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r)
Packages for visualization of PCA * fviz_pca_ind and fviz_pca_biplot from factoextra
# (0) Select data for PCA — numeric questionnaire columns only
# names(quesdata.clean)
# Medium trimmed set — excludes general Canadian stereotype experience/knowledge,
# imitation, sources of CE, and hockey questions
quesdata.pca.med <- quesdata.clean %>%
  select(IK2.au, IK2.ai,
         SE1.Fam.oot_1, SE2.Freq.Recent_1:SE2.Freq.Overall_1,
         PE2.CanSpeakFreq.Recent_1:PE2.CanSpeakFreq.Overall_1,
         ME4.CanHearFreq.Recent_1:ME4.CanHearFreq.Overall_1)
quesdata.pca.med
NA
## (1) Run Horn's Parallel Analysis with `paran`
# Standard method for deciding the number of factors/components in an FA or PCA.
# Also prints a scree plot with the randomized and unadjusted lines
paran(quesdata.pca.med,
      graph = TRUE, color = TRUE,
      col = c("black", "red", "blue"), lty = c(1, 2, 3), lwd = 1, legend = TRUE,
      file = "", width = 640, height = 640, grdevice = "png", seed = 0)
Using eigendecomposition of correlation matrix.
Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
Results of Horn's Parallel Analysis for component retention
360 iterations, using the mean estimate
--------------------------------------------------
Component Adjusted Unadjusted Estimated
Eigenvalue Eigenvalue Bias
--------------------------------------------------
1 4.971821 5.526365 0.554543
2 1.092396 1.490719 0.398322
--------------------------------------------------
Adjusted eigenvalues > 1 indicate dimensions to retain.
(2 components retained)
## (2) Run PCA with `FactoMineR`
# ncp = number of retained components; set from the parallel analysis above
# Useful accessors on the returned object:
#   lbqPCA$var  — variable-level results
#   lbqPCA$ind  — individual-level results
#   lbqPCA$call — summary statistics
lbqPCA <- PCA(quesdata.pca.med, scale.unit = TRUE, ncp = 2, graph = FALSE)
## Relevant Raw PCA Output
# Eigenvalues & percent variance accounted for
(eigenvalues <- lbqPCA$eig)
eigenvalue percentage of variance cumulative percentage of variance
comp 1 5.52636515 46.0530429 46.05304
comp 2 1.49071930 12.4226609 58.47570
comp 3 1.27627799 10.6356499 69.11135
comp 4 0.98084691 8.1737243 77.28508
comp 5 0.91569803 7.6308169 84.91589
comp 6 0.54179449 4.5149541 89.43085
comp 7 0.49885076 4.1570897 93.58794
comp 8 0.29356278 2.4463565 96.03430
comp 9 0.18330456 1.5275380 97.56183
comp 10 0.16091525 1.3409604 98.90279
comp 11 0.08765570 0.7304641 99.63326
comp 12 0.04400908 0.3667424 100.00000
# Eigenvectors (=Factor matrix, factor score coefficients; sometimes called the factor, but NOT factor scores)
# NOTE(review): FactoMineR's $var$coord are variable coordinates, i.e.
# eigenvectors scaled by sqrt(eigenvalue) (loadings/correlations) — confirm
# the "eigenvectors" label here is intended; the unscaled eigenvectors are
# only recovered below by dividing out sqrt(eigenvalue)
(eigenvectors <- lbqPCA$var$coord)
Dim.1 Dim.2
IK2.au 0.33652887 0.25988343
IK2.ai 0.07280517 -0.02741285
SE1.Fam.oot_1 0.51820257 0.49540103
SE2.Freq.Recent_1 0.65306541 0.57172271
SE2.Freq.Child_1 0.72740750 0.09144364
SE2.Freq.Overall_1 0.77628179 0.46920650
PE2.CanSpeakFreq.Recent_1 0.74762538 -0.20377204
PE2.CanSpeakFreq.Child_1 0.64620148 -0.55597789
PE2.CanSpeakFreq.Overall_1 0.81985725 -0.31236546
ME4.CanHearFreq.Recent_1 0.78228431 0.04491797
ME4.CanHearFreq.Child_1 0.75019073 -0.38985772
ME4.CanHearFreq.Overall_1 0.87040715 -0.13931773
# Factor loadings (eigenvectors scaled by the square root of their associated eigenvalues)
# Calculate factor loadings using the output eigenvectors and eigenvalues
# sweep(..., 2, ..., FUN="/") divides each column of $var$coord by the sqrt of
# that component's eigenvalue.
# NOTE(review): since $var$coord is already eigenvector * sqrt(eigenvalue),
# this division yields the unit-norm eigenvector matrix; verify that this
# (rather than $var$coord itself) is the intended input to the Varimax
# rotation below
rawLoadings <- sweep(lbqPCA$var$coord,2,sqrt(lbqPCA$eig[1:ncol(lbqPCA$var$coord),1]),FUN="/")
rawLoadings
Dim.1 Dim.2
IK2.au 0.14315369 0.21285343
IK2.ai 0.03097009 -0.02245207
SE1.Fam.oot_1 0.22043461 0.40575041
SE2.Freq.Recent_1 0.27780297 0.46826048
SE2.Freq.Child_1 0.30942684 0.07489548
SE2.Freq.Overall_1 0.33021714 0.38429619
PE2.CanSpeakFreq.Recent_1 0.31802719 -0.16689628
PE2.CanSpeakFreq.Child_1 0.27488318 -0.45536493
PE2.CanSpeakFreq.Overall_1 0.34875340 -0.25583801
ME4.CanHearFreq.Recent_1 0.33277051 0.03678935
ME4.CanHearFreq.Child_1 0.31911844 -0.31930683
ME4.CanHearFreq.Overall_1 0.37025648 -0.11410599
# Factor scores per subject and dimension (also: individual coordinate
# scores; principal coordinates)
rawScores <- lbqPCA$ind$coord
## (3) Rotate the PCA factor loadings with `GPArotation`
# Rotation is applied to the retained components' loadings only — not to all
# components nor to the eigenvectors — to ease interpretation by maximizing
# the factor loadings
rotLoadings <- Varimax(rawLoadings)$loadings
rotLoadings
Dim.1 Dim.2
IK2.au -0.01960821 0.255763715
IK2.ai 0.03821973 0.001579207
SE1.Fam.oot_1 -0.07853666 0.455034945
SE2.Freq.Recent_1 -0.07226760 0.539647817
SE2.Freq.Child_1 0.19638308 0.250575317
SE2.Freq.Overall_1 0.02090899 0.506250663
PE2.CanSpeakFreq.Recent_1 0.35301134 0.066171395
PE2.CanSpeakFreq.Child_1 0.49796921 -0.186934875
PE2.CanSpeakFreq.Overall_1 0.43225467 0.015424708
ME4.CanHearFreq.Recent_1 0.23832181 0.235143326
ME4.CanHearFreq.Child_1 0.44834243 -0.052749372
ME4.CanHearFreq.Overall_1 0.36127275 0.139971547
# Recover Rotation matrix from loadings
# Because the rotLoadings are calculated from rawLoadings %*% rotMatrix, can recover rotMatrix by rotLoadings "divided" by rawLoadings, which in matrix multiplication is multiplying by the inverse (transpose)
# This recovery is exact provided the columns of rawLoadings are orthonormal
# (unit PCA eigenvectors), so that t(rawLoadings) %*% rawLoadings = I and
# t(rawLoadings) acts as a left-inverse.
# Note: For some reason, can't call Varimax(rawLoadings)$rotmat (just get NULL); this recreates the same matrix from Varimax(rawLoadings)
# NOTE(review): GPArotation stores the rotation matrix in the $Th component,
# not $rotmat — Varimax(rawLoadings)$Th should give the same matrix; confirm
(rotMatrixL <- t(rawLoadings) %*% rotLoadings)
Dim.1 Dim.2
Dim.1 0.7847043 0.6198703
Dim.2 -0.6198703 0.7847043
# Calculate rotated factor scores:
# z-score the raw individual scores with base R scale(), then project them
# through the rotation matrix. Result: rows = subjects, columns = rotated
# components
zScores <- scale(rawScores)
rotScores <- zScores %*% rotMatrixL
## (4) Data Visualization of Raw Scores with `factoextra`
# Individual factor scores
fviz_pca_ind(lbqPCA, col.ind = "#00AFBB", repel = TRUE)
# Biplot: individual scores plus variable (factor) vectors
fviz_pca_biplot(lbqPCA, label = "all", col.ind = "#00AFBB", col.var = "black",
                ggtheme = theme_minimal())
## (5) Manual Plots of Rotated Scores with `ggplot`
## Build data frames from the rotated loading and score matrices
# Rotated factor loadings -> data frame, with variable names as a factor column
rotLoadingsData <- rotLoadings %>%
  as.data.frame() %>%
  mutate(variable = factor(row.names(rotLoadings)))
# Rotated factor scores -> data frame, with a sequential subject number
rotScoreData <- as.data.frame(rotScores)
rotScoreData <- mutate(rotScoreData, subject = 1:nrow(rotScoreData))
## Create base plots
# Loading plot: arrows (coordinates scaled x4) + variable labels
loadingplot <- ggplot(rotLoadingsData, aes(x=Dim.1, y=Dim.2)) +
  geom_segment(data=rotLoadingsData,
               mapping=aes(x=0, y=0, xend=Dim.1*4, yend=Dim.2*4),
               arrow=arrow(), size=0.5, color="black") +
  geom_text(data=rotLoadingsData,
            aes(x=Dim.1*4, y=Dim.2*4, label=variable),
            color="red", check_overlap=TRUE) +
  scale_x_continuous(lim=c(-2.75, 2.75), breaks=seq(-3, 3, 1)) +
  scale_y_continuous(lim=c(-3.5, 3.5), breaks=seq(-3, 3, 1)) +
  geom_hline(yintercept=0, linetype="dashed") +
  geom_vline(xintercept=0, linetype="dashed") +
  labs(title="Variables - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
  gg_theme()
loadingplot
# Scatter plot of individual factor scores
dimplot <- ggplot(rotScoreData, aes(x=Dim.1, y=Dim.2)) +
  geom_point(na.rm=TRUE, color="#00AFBB") +
  geom_text(aes(label=subject), hjust=1.5, vjust=1.5,
            color="#00AFBB", check_overlap=TRUE) +
  scale_x_continuous(lim=c(-2.75, 2.75), breaks=seq(-3, 3, 1)) +
  scale_y_continuous(lim=c(-3.5, 3.5), breaks=seq(-3, 3, 1)) +
  geom_hline(yintercept=0, linetype="dashed") +
  geom_vline(xintercept=0, linetype="dashed") +
  labs(title="Individuals - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
  gg_theme()
dimplot
## Merge loading and score plot = Biplot
# Biplot of factor loadings + individual factor scores
# (loading labels pushed slightly further out, x4.5, to clear the arrowheads)
ggplot(rotScoreData, aes(x=Dim.1, y=Dim.2)) +
  geom_point(na.rm=TRUE, color="#00AFBB") +
  geom_text(aes(label=subject), hjust=1.5, vjust=1.5,
            color="#00AFBB", check_overlap=TRUE) +
  # Overlay loading plot (i.e. arrows)
  geom_segment(data=rotLoadingsData,
               mapping=aes(x=0, y=0, xend=Dim.1*4, yend=Dim.2*4),
               arrow=arrow(), size=0.5, color="black") +
  geom_text(data=rotLoadingsData,
            aes(x=Dim.1*4.5, y=Dim.2*4.5, label=variable),
            color="red", check_overlap=TRUE, nudge_y = 0) +
  scale_x_continuous(lim=c(-2.75, 2.75), breaks=seq(-3, 3, 1)) +
  scale_y_continuous(lim=c(-3.5, 3.5), breaks=seq(-3, 3, 1)) +
  geom_hline(yintercept=0, linetype="dashed") +
  geom_vline(xintercept=0, linetype="dashed") +
  labs(title="Biplot - PCA", x="Dim 1 (36.4%)", y="Dim 2 (17.7%)") +
  gg_theme()
## Interpret PCs (Dimensions) based on factor loadings
rotLoadings.df <- as.data.frame(rotLoadings) %>%
  rownames_to_column(., "Variables") %>%
  rename(., "PC1"= "Dim.1", "PC2" = "Dim.2")
rotLoadings.df
# Absolute-loading threshold for counting a variable as a contributor;
# defined once so the four filters below cannot drift out of sync
loading.cutoff <- 0.2
# PC1 Only Contributors
rotLoadings.df %>% filter(abs(PC1) > loading.cutoff)
# PC2 Only Contributors
rotLoadings.df %>% filter(abs(PC2) > loading.cutoff)
# Check for overlapping contributors
rotLoadings.df %>% filter(abs(PC2) > loading.cutoff & abs(PC1) > loading.cutoff)
# Check for non-contributors
rotLoadings.df %>% filter(abs(PC2) < loading.cutoff & abs(PC1) < loading.cutoff)
# For merging: rotated factor scores -> data frame keyed by participantId
# (assumes the row order matches the PCA input data frame)
indPCAdata <- rotScoreData %>%
  mutate(participantId = quesdata.clean$participantId) %>%
  rename(CEscore = Dim.1, SAscore = Dim.2)
# Merge participant data with the PC scores
# Keep only the main relevant scores up front; MSscore/EQscore are z-scored
quesdata.final <- quesdata.clean %>%
  merge(indPCAdata) %>%
  mutate(MSscore = scale(SK2.MichvStand),
         EQscore = scale(EQ.raws)) %>%
  mutate(across(Age, as.numeric)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  select(participantId, conditionId, guiseCombination, speakerOrder, Age, Gender,
         Ethnicity, CEscore, SAscore, MSscore, EQscore, EQ.raws, everything())
summary(quesdata.final)
participantId conditionId guiseCombination speakerOrder Age Gender
5610c4ea7ffc8a0005811504: 1 condA:20 baseline:40 S3-S9:60 Min. :18.00 f :68
57509d9b363e77000695620b: 1 condB:20 match :40 S9-S3:60 1st Qu.:23.00 m :50
5765b92cf96b100001f64c9a: 1 condC:20 mismatch:40 Median :28.50 nb: 2
57901c44900cc80001d2e9c7: 1 condD:20 Mean :33.63
59302683d0003800018f39b0: 1 condE:20 3rd Qu.:41.00
596016f51d5c3e00012f03c3: 1 condF:20 Max. :67.00
(Other) :114
Ethnicity CEscore SAscore MSscore.V1 EQscore.V1 EQ.raws
asian : 2 Min. :-1.9258 Min. :-2.8932 Min. :-3.0666851 Min. :-2.2722145 Min. :17.00
black : 9 1st Qu.:-0.7416 1st Qu.:-0.6085 1st Qu.:-0.3674024 1st Qu.:-0.6553636 1st Qu.:35.75
hispanic : 3 Median :-0.1493 Median : 0.1282 Median : 0.5323585 Median :-0.0301812 Median :43.00
middle-eastern: 1 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000000 Mean : 0.0000000 Mean :43.35
multiracial : 5 3rd Qu.: 0.7107 3rd Qu.: 0.6579 3rd Qu.: 0.5323585 3rd Qu.: 0.7459072 3rd Qu.:52.00
native : 1 Max. : 2.6149 Max. : 1.9687 Max. : 1.4321195 Max. : 2.5567803 Max. :73.00
white :99
IK2.au IK2.ai IK2.au.out IK2.au.about IK2.au.devout IK2.au.house IK2.au.pouch
Min. :0.00 Min. :0.0000 Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000 Min. :0.0
1st Qu.:2.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0
Median :4.00 Median :0.0000 Median :1.0000 Median :1.00 Median :1.000 Median :1.0000 Median :1.0
Mean :3.25 Mean :0.8833 Mean :0.6417 Mean :0.65 Mean :0.775 Mean :0.5833 Mean :0.6
3rd Qu.:5.00 3rd Qu.:1.2500 3rd Qu.:1.0000 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0
Max. :5.00 Max. :5.0000 Max. :1.0000 Max. :1.00 Max. :1.000 Max. :1.0000 Max. :1.0
IK2.ai.like IK2.ai.right IK2.ai.might IK2.ai.unite IK2.ai.ripe Travel.NE_Visits Travel.S_Visits
Min. :0.0 Min. :0.000 Min. :0.0 Min. :0.00 Min. :0.0000 Min. :1.000 Min. :1.000
1st Qu.:0.0 1st Qu.:0.000 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:2.000
Median :0.0 Median :0.000 Median :0.0 Median :0.00 Median :0.0000 Median :2.000 Median :2.000
Mean :0.1 Mean :0.275 Mean :0.2 Mean :0.15 Mean :0.1583 Mean :1.867 Mean :2.158
3rd Qu.:0.0 3rd Qu.:1.000 3rd Qu.:0.0 3rd Qu.:0.00 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:3.000
Max. :1.0 Max. :1.000 Max. :1.0 Max. :1.00 Max. :1.0000 Max. :4.000 Max. :4.000
Travel.W_Visits Travel.Can_Visits Travel.NE_Time Travel.S_Time Travel.W_Time Travel.Can_Time SK2.MichvStand
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :2.000
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:5.000
Median :2.000 Median :2.000 Median :1.000 Median :1.000 Median :1.000 Median :1.000 Median :6.000
Mean :1.708 Mean :1.942 Mean :1.333 Mean :1.458 Mean :1.325 Mean :1.158 Mean :5.408
3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:6.000
Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :7.000
SK3.CanvStand IK1.DecideCanada EK1.CanSpeak EK2.CanPron EK3.CanAI EK3.CanAI.Diff EK4.CanAU
Min. :2.000 Min. :2.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:3.000 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
Median :4.000 Median :3.00 Median :1.000 Median :1.000 Median :2.000 Median :2.000 Median :1.000
Mean :4.283 Mean :3.15 Mean :1.033 Mean :1.058 Mean :1.708 Mean :1.783 Mean :1.142
3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.000
Max. :6.000 Max. :5.00 Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000 Max. :2.000
EK4.CanAU.Diff SE1.Fam.eh_1 SE1.Fam.oot_1 SE1.Fam.sorry_1 SE1.Fam.washroom_1 SE1.Fam.nasal_1 SE2.Freq.Recent_1
Min. :1.0 Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.0 1st Qu.:6.0 1st Qu.:5.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
Median :1.0 Median :7.0 Median :7.000 Median :6.000 Median :3.000 Median :2.000 Median :4.000
Mean :1.3 Mean :6.1 Mean :5.683 Mean :4.783 Mean :3.367 Mean :2.825 Mean :3.875
3rd Qu.:2.0 3rd Qu.:7.0 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:5.250 3rd Qu.:4.000 3rd Qu.:5.000
Max. :2.0 Max. :7.0 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
SE2.Freq.Child_1 SE2.Freq.Overall_1 SE3.Accuracy SE4.CanHearImitate SE4.auHearImitate SE5.CanImitate
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.750 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:1.000
Median :4.000 Median :4.000 Median :5.000 Median :3.000 Median :5.000 Median :2.000
Mean :3.692 Mean :4.092 Mean :4.758 Mean :3.225 Mean :4.608 Mean :2.142
3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:3.000
Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000 Max. :6.000
SE5.auImitate PE1.KnowCan PE1.Relatives PE1.CloseFamFriends PE2.CanSpeakFreq.Recent_1
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.0 1st Qu.:2.000
Median :2.000 Median :1.000 Median :2.000 Median :2.0 Median :3.000
Mean :2.958 Mean :1.367 Mean :1.867 Mean :1.7 Mean :3.125
3rd Qu.:5.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:4.000
Max. :7.000 Max. :2.000 Max. :2.000 Max. :2.0 Max. :7.000
PE2.CanSpeakFreq.Child_1 PE2.CanSpeakFreq.Overall_1 ME2.HockeyFreq ME3.Sources.People_1 ME3.Sources.TV_1
Min. :1.000 Min. :1.00 Min. :1.0 Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.:2.00 1st Qu.:1.0 1st Qu.:1.000 1st Qu.:2.000
Median :2.000 Median :3.00 Median :2.0 Median :2.000 Median :3.000
Mean :2.658 Mean :3.15 Mean :2.4 Mean :2.283 Mean :2.817
3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:3.0 3rd Qu.:3.000 3rd Qu.:4.000
Max. :7.000 Max. :7.00 Max. :7.0 Max. :7.000 Max. :6.000
ME3.Sources.News_1 ME3.Sources.Hockey_1 ME3.Sources.Online_1 ME3.Sources.OtherMedia_1 ME3.Sources.Other_1
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.0 1st Qu.:1.000
Median :2.000 Median :2.000 Median :3.000 Median :1.0 Median :1.000
Mean :2.042 Mean :2.342 Mean :2.958 Mean :1.3 Mean :1.208
3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1.0 3rd Qu.:1.000
Max. :5.000 Max. :7.000 Max. :7.000 Max. :6.0 Max. :7.000
ME4.CanHearFreq.Recent_1 ME4.CanHearFreq.Child_1 ME4.CanHearFreq.Overall_1 tonesCorrect subject
Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.000 Min. : 1.00
1st Qu.:2.750 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.750 1st Qu.: 30.75
Median :4.000 Median :3.000 Median :3.500 Median :6.000 Median : 60.50
Mean :3.717 Mean :3.067 Mean :3.492 Mean :4.392 Mean : 60.50
3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.: 90.25
Max. :7.000 Max. :7.000 Max. :6.000 Max. :6.000 Max. :120.00
quesdata.final
# Write the cleaned questionnaire data to file
write.csv(quesdata.final, "data/axb_1a_lbq_data.csv", row.names = FALSE)
# Total
# Summary statistics (mean/sd/min/max) of the key scores across all participants,
# reshaped to one row per measure with one column per statistic
quesdata.final %>%
  select(participantId, conditionId, guiseCombination, Age, Gender, Ethnicity,
         CEscore, SAscore, MSscore, EQscore, EQ.raws) %>%
  # across(where(is.numeric), ...) replaces superseded summarise_if();
  # default .names "{.col}_{.fn}" matches the old "<Measure>_<Stat>" naming
  summarise(across(where(is.numeric), list(mean = mean, sd = sd, min = min, max = max))) %>%
  pivot_longer(everything(), names_to = c("Measure", "Stat"), names_sep = "_", values_to = "value") %>%
  pivot_wider(id_cols = Measure, names_from = "Stat", values_from = "value") %>%
  mutate(mean = round(mean, digits = 6)) %>%
  mutate(group = "Total") %>%
  select(group, everything())
NA
# Per-guise summaries of the key scores.
# The same pipeline was previously written out three times (baseline/match/
# mismatch); factored into one helper so the three tables cannot diverge.
# Args: df = quesdata.final-shaped data; guise = a guiseCombination level,
# also used as the group label in the output.
guise_score_summary <- function(df, guise) {
  df %>%
    select(participantId, conditionId, guiseCombination, Age, Gender, Ethnicity,
           CEscore, SAscore, MSscore, EQscore, EQ.raws) %>%
    filter(guiseCombination == guise) %>%
    # across(where(is.numeric), ...) replaces superseded summarise_if();
    # default .names "{.col}_{.fn}" matches the old "<Measure>_<Stat>" naming
    summarise(across(where(is.numeric), list(mean = mean, sd = sd, min = min, max = max))) %>%
    pivot_longer(everything(), names_to = c("Measure", "Stat"), names_sep = "_", values_to = "value") %>%
    pivot_wider(id_cols = Measure, names_from = "Stat", values_from = "value") %>%
    mutate(mean = round(mean, digits = 6)) %>%
    mutate(group = guise) %>%
    select(group, everything())
}
# Baseline
guise_score_summary(quesdata.final, "baseline")
# Match
guise_score_summary(quesdata.final, "match")
# Mismatch
guise_score_summary(quesdata.final, "mismatch")
#quesdata.final.sum <-
quesdata.final %>% group_by(guiseCombination) %>%
summarize(#meanEQ = mean(EQscore),
meanIK2.au = mean(IK2.au), meanIK2.ai = mean(IK2.ai),
#meanDecideCan = mean(IK1.DecideCanada),
#meanEhFam = mean(SE1.Fam.eh_1),
meanOotFam = mean(SE1.Fam.oot_1),
meanSEFreq = mean(SE2.Freq.Overall_1), meanSEFreq.C = mean(SE2.Freq.Child_1),
meanSEFreq.R = mean(SE2.Freq.Recent_1),
meanSEAcc = mean(SE3.Accuracy))
# Per-group means of self-reported exposure to Canadian English:
# how often participants hear (ME4) and speak (PE2) it, overall /
# recently / in childhood. Printed, not stored.
# quesdata.final.sum <-
quesdata.final %>%
  group_by(guiseCombination) %>%
  summarise(
    CanHearFreq    = mean(ME4.CanHearFreq.Overall_1),
    CanHearFreq.R  = mean(ME4.CanHearFreq.Recent_1),
    CanHearFreq.C  = mean(ME4.CanHearFreq.Child_1),
    CanSpeakFreq   = mean(PE2.CanSpeakFreq.Overall_1),
    CanSpeakFreq.R = mean(PE2.CanSpeakFreq.Recent_1),
    CanSpeakFreq.C = mean(PE2.CanSpeakFreq.Child_1)
  )
# Yes/no breakdown of one binary explicit-knowledge (EK) question per guise
# group: recode 1 -> "yes" (else "no"), count within group, and spread the
# counts into yes/no columns. The four original copy-pasted pipelines are
# factored into a single tidy-eval helper; `add_prop` reproduces the
# prop.yes column the originals computed for EK3/EK4 only.
count_ek_responses <- function(data, var, label, add_prop = FALSE) {
  out <- data %>%
    group_by(guiseCombination) %>%
    mutate({{ var }} := ifelse({{ var }} == 1, "yes", "no")) %>%
    count({{ var }}) %>%
    pivot_wider(names_from = {{ var }}, values_from = n)
  if (add_prop) {
    # NOTE: like the original pipelines, this errors if every group answered
    # unanimously (a missing yes/no column) — confirm both levels occur.
    out <- out %>% mutate(prop.yes = yes / (yes + no))
  }
  out %>% mutate(question = label) %>% relocate(question)
}
count_ek_responses(quesdata.final, EK1.CanSpeak, "SpeakDiff")
count_ek_responses(quesdata.final, EK2.CanPron, "PronounceDiff")
count_ek_responses(quesdata.final, EK3.CanAI, "aiDiff", add_prop = TRUE)
count_ek_responses(quesdata.final, EK4.CanAU, "auDiff", add_prop = TRUE)
# Overall (ungrouped) yes/no totals and yes-proportions for the two
# vowel-difference questions, stacked into one table for display.
rbind(
  quesdata.final %>%
    mutate(EK3.CanAI = ifelse(EK3.CanAI == 1, "yes", "no")) %>%
    count(EK3.CanAI) %>%
    pivot_wider(names_from = EK3.CanAI, values_from = n) %>%
    mutate(prop.yes = yes / (yes + no)) %>%
    mutate(question = "aiDiff") %>%
    relocate(question),
  quesdata.final %>%
    mutate(EK4.CanAU = ifelse(EK4.CanAU == 1, "yes", "no")) %>%
    count(EK4.CanAU) %>%
    pivot_wider(names_from = EK4.CanAU, values_from = n) %>%
    mutate(prop.yes = yes / (yes + no)) %>%
    mutate(question = "auDiff") %>%
    relocate(question)
)
# NOTE(review): stray top-level NA below — it just prints "NA" and looks
# like a knitting artifact; confirm before removing.
NA
# --- EQ score: distribution and group comparisons ---
# Density plot of score distributions (groups relabelled for display).
# Fix: a geom_density() y axis shows density, not counts, so the label
# is corrected from "Count" to "Density".
quesdata.final %>%
  mutate(guiseCombination = plyr::mapvalues(
    guiseCombination,
    from = c("baseline", "match", "mismatch"),
    to = c("No-Guise", "Guise-Match", "Guise-Mismatch"))) %>%
  ggplot(aes(x = EQscore, fill = guiseCombination, color = guiseCombination)) +
  geom_density(alpha = 0.3) +
  labs(title = "", x = "EQ Score", y = "Density") +
  gg_theme()
# By Gender
ggplot(data = quesdata.final, aes(x = Gender, y = EQscore)) +
  geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3,
               na.rm = TRUE) +
  gg_theme()
# By Gender + Guise
ggplot(data = quesdata.final, aes(x = Gender, y = EQscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination,
                   linetype = Gender), alpha = 0.3, na.rm = TRUE) +
  facet_grid(~guiseCombination) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# By Guise
ggplot(data = quesdata.final, aes(x = guiseCombination, y = EQscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination),
               alpha = 0.3, na.rm = TRUE) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# --- SA score: distribution and group comparisons ---
# Density plot of score distributions (groups relabelled for display).
# Fix: a geom_density() y axis shows density, not counts, so the label
# is corrected from "Count" to "Density".
quesdata.final %>%
  mutate(guiseCombination = plyr::mapvalues(
    guiseCombination,
    from = c("baseline", "match", "mismatch"),
    to = c("No-Guise", "Guise-Match", "Guise-Mismatch"))) %>%
  ggplot(aes(x = SAscore, fill = guiseCombination, color = guiseCombination)) +
  geom_density(alpha = 0.3) +
  labs(title = "", x = "SA Score", y = "Density") +
  gg_theme()
# By Gender
ggplot(data = quesdata.final, aes(x = Gender, y = SAscore)) +
  geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3,
               na.rm = TRUE) +
  gg_theme()
# By Gender + Guise
ggplot(data = quesdata.final, aes(x = Gender, y = SAscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination,
                   linetype = Gender), alpha = 0.3, na.rm = TRUE) +
  facet_grid(~guiseCombination) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# By Guise
ggplot(data = quesdata.final, aes(x = guiseCombination, y = SAscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination),
               alpha = 0.3, na.rm = TRUE) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# --- CE score: distribution and group comparisons ---
# Density plot of score distributions (groups relabelled for display).
# Fix: a geom_density() y axis shows density, not counts, so the label
# is corrected from "Count" to "Density".
quesdata.final %>%
  mutate(guiseCombination = plyr::mapvalues(
    guiseCombination,
    from = c("baseline", "match", "mismatch"),
    to = c("No-Guise", "Guise-Match", "Guise-Mismatch"))) %>%
  ggplot(aes(x = CEscore, fill = guiseCombination, color = guiseCombination)) +
  geom_density(alpha = 0.3) +
  labs(title = "", x = "CE Score", y = "Density") +
  gg_theme()
# By Gender
ggplot(data = quesdata.final, aes(x = Gender, y = CEscore)) +
  geom_boxplot(aes(linetype = Gender), fill = "grey", alpha = 0.3,
               na.rm = TRUE) +
  gg_theme()
# By Gender + Guise
ggplot(data = quesdata.final, aes(x = Gender, y = CEscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination,
                   linetype = Gender), alpha = 0.3, na.rm = TRUE) +
  facet_grid(~guiseCombination) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# By Guise
ggplot(data = quesdata.final, aes(x = guiseCombination, y = CEscore)) +
  geom_boxplot(aes(fill = guiseCombination, color = guiseCombination),
               alpha = 0.3, na.rm = TRUE) +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  scale_fill_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
# Biplot of the two composite questionnaire scores (CE on PC1, SA on PC2),
# one point per participant, coloured/shaped by guise group.
CSCE.biplot <- quesdata.final %>%
  mutate(guiseCombination = plyr::mapvalues(
    guiseCombination,
    from = c("baseline", "match", "mismatch"),
    to = c("No-Guise", "Guise-Match", "Guise-Mismatch"))) %>%
  ggplot(aes(y = CEscore, x = SAscore,
             color = guiseCombination, shape = guiseCombination)) +
  geom_point(na.rm = TRUE, size = 3, alpha = 0.7) +
  # geom_text(aes(label = subject), hjust = 1.5, vjust = 1.5,
  #           color = "#00AFBB", check_overlap = TRUE) +
  # Fix: spell out `limits` — the original relied on partial matching of
  # `lim=`. Breaks outside the limits are silently dropped by ggplot2.
  scale_x_continuous(limits = c(-2.5, 2.5), breaks = seq(-3, 3, 1)) +
  scale_y_continuous(limits = c(-3.5, 3.5), breaks = seq(-3, 3, 1)) +
  # Dashed reference lines mark the origin of the PC space.
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(y = "CE Scores (PC1)", x = "SA Scores (PC2)",
       color = "Participant Group", shape = "Participant Group") +
  scale_color_manual(values = ghibli_palette("PonyoMedium")[c(6, 3, 4)]) +
  gg_theme()
CSCE.biplot
# ggsave(path = "plots", filename = "CS-CE_distribution_plot.png",
#        CSCE.biplot, width = 16, height = 8, units = "in", dpi = 72)
# Pairwise correlations between the composite questionnaire scores, one
# point per participant coloured by gender, with a linear fit. The three
# original copy-pasted pipelines are factored into one tidy-eval helper.
# Fixes: drop the redundant `stat = "identity"` (geom_point()'s default)
# and replace the base-graphics alias `cex` with ggplot2's `size`.
plot_score_corr <- function(data, xvar, yvar, xlab, ylab) {
  data %>%
    ggplot(aes(x = {{ xvar }}, y = {{ yvar }})) +
    geom_point(aes(colour = factor(Gender)), size = 2) +
    geom_smooth(method = "lm") +
    # NOTE(review): the original titles all read "By Speaker" even though
    # each point is a participant — confirm the intended wording.
    labs(y = ylab, x = xlab, color = "Gender",
         title = "Correlations: By Speaker") +
    scale_color_manual(values = ghibli_palette("PonyoLight")[c(4, 3, 2)]) +
    gg_theme()
}
plot_score_corr(quesdata.final, SAscore, EQscore, "SA Score", "EQ Score")
plot_score_corr(quesdata.final, SAscore, CEscore, "SA Score", "CE Score")
plot_score_corr(quesdata.final, CEscore, EQscore, "CE Score", "EQ Score")